{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pprint\n",
    "import re\n",
    "\n",
    "import nltk\n",
    "from tools import show_subtitle\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap4 编写结构化的程序\n",
    "1.  怎样才能写出结构良好，可读性强的程序，从而方便重用？\n",
    "2.  基本的结构块，例如：循环、函数和赋值是如何执行的？\n",
    "3.  Python 编程的陷阱还有哪些，如何避免它们？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 4.7 算法设计(P175)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 4.7.1 递归与迭代(P176)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "factorial1(10)=  3628800\nfactorial2(10)=  3628800\n"
     ]
    }
   ],
   "source": [
    "# 两种解决方案各有利弊。递归更容易理解，迭代速度更快\n",
    "def factorial1(n):\n",
    "    result = 1\n",
    "    for i in range(n):\n",
    "        result *= (i + 1)\n",
    "    return result\n",
    "\n",
    "\n",
    "def factorial2(n):\n",
    "    if n == 1:\n",
    "        return 1\n",
    "    else:\n",
    "        return n * factorial2(n - 1)\n",
    "\n",
    "print(\"factorial1(10)= \",factorial1(10))\n",
    "print(\"factorial2(10)= \",factorial2(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "size1(dog)=  190\nsize2(dog)=  190\n"
     ]
    }
   ],
   "source": [
    "def size1(s):\n",
    "    return 1 + sum(size1(child) for child in s.hyponyms())\n",
    "\n",
    "\n",
    "def size2(s):\n",
    "    layer = [s]\n",
    "    total = 0\n",
    "    while layer:\n",
    "        total += len(layer)\n",
    "        layer = [h for c in layer for h in c.hyponyms()]\n",
    "    return total\n",
    "\n",
    "\n",
    "from nltk.corpus import wordnet as wn\n",
    "\n",
    "dog = wn.synset('dog.n.01')\n",
    "print(\"size1(dog)= \",size1(dog))\n",
    "print(\"size2(dog)= \",size2(dog))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "{'c': {'h': {'a': {'i': {'r': {'value': 'flesh'}},\n                   't': {'value': 'cat'}},\n             'i': {'c': {'value': 'stylist'},\n                   'e': {'n': {'value': 'dog'}}}}}}\ntrie['c']['h']['a']['t']['value']= cat\n"
     ]
    }
   ],
   "source": [
    "# Ex4-6 构建一个字母查找树\n",
    "# 一个递归函数建立一个嵌套的字典结构，每一级嵌套包含给定前缀的所有单词\n",
    "# 而子查找树含有所有可能的后续词\n",
    "def insert(trie, key, value):\n",
    "    if key:\n",
    "        first, rest = key[0], key[1:]\n",
    "        if first not in trie:\n",
    "            trie[first] = {}\n",
    "        insert(trie[first], rest, value)\n",
    "    else:\n",
    "        trie['value'] = value\n",
    "\n",
    "\n",
    "trie = {}\n",
    "insert(trie, 'chat', 'cat')\n",
    "insert(trie, 'chien', 'dog')\n",
    "insert(trie, 'chair', 'flesh')\n",
    "insert(trie, 'chic', 'stylist')\n",
    "trie = dict(trie)\n",
    "pprint.pprint(trie, width=40)\n",
    "print(\"trie['c']['h']['a']['t']['value']=\", trie['c']['h']['a']['t']['value'])"
   ]
  },
  {
   "source": [
    "### 4.7.2 空间与时间的平衡(P179)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Building Index...\n",
      "s funded by her mother . lucy quit working professionally 10\n",
      "erick . i disliked that movie quite a bit , but since \" prac\n",
      "t disaster . babe ruth didn't quit baseball after one season\n",
      "o-be fiance . i think she can quit that job and get a more r\n",
      " and rose mcgowan should just quit acting . she has no chari\n",
      "and get a day job . and don't quit it .                     \n",
      " kubrick , alas , should have quit while he was ahead . this\n",
      "everyone involved should have quit while they were still ahe\n",
      "l die . so what does joe do ? quit his job , of course ! ! w\n",
      "red \" implant . he's ready to quit the biz and get a portion\n"
     ]
    }
   ],
   "source": [
    "# Ex4-7: 一个简单的全文检索系统\n",
    "# 通过对文档索引集合，提高搜索速度，然后再对文档展开搜索，减少搜索准备\n",
    "def raw(file):\n",
    "    contents = open(file).read()\n",
    "    contents = re.sub(r'<.*?>', ' ', contents)\n",
    "    contents = re.sub(r'\\s+', ' ', contents)\n",
    "    return contents\n",
    "\n",
    "\n",
    "def snippet(doc, term):\n",
    "    text = ' ' * 30 + raw(doc) + ' ' * 30\n",
    "    pos = text.index(term)\n",
    "    return text[pos - 30:pos + 30]\n",
    "\n",
    "\n",
    "print('Building Index...')\n",
    "files = nltk.corpus.movie_reviews.abspaths()\n",
    "idx = nltk.Index(\n",
    "        (w, f)\n",
    "        for f in files\n",
    "        for w in raw(f).split()\n",
    ")\n",
    "\n",
    "query = ''\n",
    "while query != 'quit':\n",
    "    query = input('query> ')\n",
    "    if query in idx:\n",
    "        for i, doc in enumerate(idx[query]):\n",
    "            if i < 10:\n",
    "                print(snippet(doc, query))\n",
    "    else:\n",
    "        print('Not found')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "0.0022696566730238824\n0.003698164120670158\n"
     ]
    }
   ],
   "source": [
    "# Ex4-8 预处理已经标的语料库数据，将所有的词和标都转换成整数\n",
    "def preprocess(tagged_corpus):\n",
    "    words = set()\n",
    "    tags = set()\n",
    "    for sent in tagged_corpus:\n",
    "        for word, tag in sent:\n",
    "            words.add(word)\n",
    "            tags.add(tag)\n",
    "    wm = dict((w, i) for (i, w) in enumerate(words))\n",
    "    tm = dict((t, i) for (i, t) in enumerate(tags))\n",
    "    return [[(wm[w], tm[t]) for (w, t) in sent] for sent in tagged_corpus]\n",
    "\n",
    "\n",
    "# 使用timeit模块检测执行速度\n",
    "# Timer类有两个参数：一个是多次执行的代码；一个是只在开始执行一次的设置代码。\n",
    "# 例子：整数的链表 和 整数的集合 模拟10万个项目的词汇表\n",
    "# 测试声明将产生随机项，它有50%的机会出现在词汇表中。\n",
    "from timeit import Timer\n",
    "\n",
    "vocab_size = 100000\n",
    "setup_list = 'import random; vocab=range(%d)' % vocab_size\n",
    "setup_set = 'import random; vocab=set(range(%d))' % vocab_size\n",
    "statement = 'random.randint(0, %d) in vocab' % (vocab_size * 2)\n",
    "# 以前的Python集合比列表快，现在几乎没有差别\n",
    "print(Timer(statement, setup_list).timeit(1000))\n",
    "print(Timer(statement, setup_set).timeit(1000))\n",
    "vocab = range(vocab_size)"
   ]
  },
  {
   "source": [
    "### 4.7.3 动态规划(P181)\n",
    "动态规划是在自然语言处理中广泛使用的算法。\n",
    "-   解决的问题内部包含了多个重叠的子问题。\n",
    "-   算法可以避免重复计算这些子问题，而是简单地将它们的计算结果存储在一个查找表中。"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['SSSS', 'SSL', 'SLS', 'LSS', 'LL']"
      ]
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "source": [
    "# Ex4-9 4种方法计算梵文旋律：迭代、自底向上的动态规划、自上而下的动态规划、内置默记法\n",
    "# ToDo: 迭代算法，值得反复学习\n",
    "# 递归计算中会有重复计算的部分\n",
    "def virahanka1(n):\n",
    "    if n == 0:\n",
    "        return [\"\"]\n",
    "    elif n == 1:\n",
    "        return [\"S\"]\n",
    "    else:\n",
    "        s = [\"S\" + prosody for prosody in virahanka1(n - 1)]\n",
    "        l = [\"L\" + prosody for prosody in virahanka1(n - 2)]\n",
    "        return s + l\n",
    "\n",
    "\n",
    "virahanka1(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['SSSS', 'SSL', 'SLS', 'LSS', 'LL']"
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "# 自底向上的动态规划\n",
    "# 将较小的实例计算结果填充到表格中，一旦得到感兴趣的值就停止，\n",
    "# 原则是解决较大问题之前先解决较小的问题，这就是自下而上的动态规划\n",
    "# 某些计算得到的子问题在解决主问题时可能并不需要，从而造成浪费\n",
    "def virahanka2(n):\n",
    "    lookup = [[\"\"], [\"S\"]]\n",
    "    for i in range(n - 1):\n",
    "        s = [\"S\" + prosody for prosody in lookup[i + 1]]\n",
    "        l = [\"L\" + prosody for prosody in lookup[i]]\n",
    "        lookup.append(s + l)\n",
    "    return lookup[n]\n",
    "\n",
    "\n",
    "virahanka2(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['SSSS', 'SSL', 'SLS', 'LSS', 'LL']"
      ]
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "source": [
    "# 自上而下的动态规划：可以避免计算不需要的子问题带来的浪费\n",
    "def virahanka3(n, lookup={0: [\"\"], 1: [\"S\"]}):\n",
    "    if n not in lookup:\n",
    "        s = [\"S\" + prosody for prosody in virahanka3(n - 1)]\n",
    "        l = [\"L\" + prosody for prosody in virahanka3(n - 2)]\n",
    "        lookup[n] = s + l\n",
    "    return lookup[n]\n",
    "\n",
    "\n",
    "virahanka3(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['SSSS', 'SSL', 'SLS', 'LSS', 'LL']"
      ]
     },
     "metadata": {},
     "execution_count": 10
    }
   ],
   "source": [
    "# 内置默记法：使用Python的装饰器模式引入的缓存机制\n",
    "from nltk import memoize\n",
    "\n",
    "\n",
    "@memoize\n",
    "def virahanka4(n):\n",
    "    if n == 0:\n",
    "        return [\"\"]\n",
    "    elif n == 1:\n",
    "        return [\"S\"]\n",
    "    else:\n",
    "        s = [\"S\" + prosody for prosody in virahanka4(n - 1)]\n",
    "        l = [\"L\" + prosody for prosody in virahanka4(n - 2)]\n",
    "        return s + l\n",
    "\n",
    "\n",
    "virahanka4(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['SSSS', 'SSL', 'SLS', 'LSS', 'LL']"
      ]
     },
     "metadata": {},
     "execution_count": 11
    }
   ],
   "source": [
    "# 由functools.lru_cache实现的Python的memoization比我们的专用memoize函数更全面，就像你在CPython源代码中看到的一样\n",
    "from functools import lru_cache\n",
    "\n",
    "\n",
    "@lru_cache(1000)\n",
    "def virahanka5(n):\n",
    "    if n == 0:\n",
    "        return [\"\"]\n",
    "    elif n == 1:\n",
    "        return [\"S\"]\n",
    "    else:\n",
    "        s = [\"S\" + prosody for prosody in virahanka4(n - 1)]\n",
    "        l = [\"L\" + prosody for prosody in virahanka4(n - 2)]\n",
    "        return s + l\n",
    "\n",
    "\n",
    "virahanka5(4)"
   ]
  }
 ]
}